# --- Environment setup: imports, NLTK resources, progress bars ----------------
################################new section classification
# import warnings
# warnings.filterwarnings("ignore")
import pandas as pd
from tqdm import tqdm
tqdm.pandas()  # enables Series/DataFrame.progress_apply with a tqdm bar
import nltk
from nltk.stem import PorterStemmer
porter = PorterStemmer()  # module-level stemmer shared by stem_sentences()
from nltk.corpus import wordnet
nltk.download('omw-1.4')  # Open Multilingual Wordnet data used by wordnet lookups
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
# Command-line selection of the SDG kept for reference; `number` is hard-coded below.
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument("--n", help="SDG K")
# args = parser.parse_args()
#
# number = int(args.n)
def get_category(x):
    """Collapse the two 'Strongly *' MSCI alignment labels into their base class.

    Any other label (e.g. "Neutral") is returned unchanged.
    """
    collapsed = {"Strongly Misaligned": "Misaligned",
                 "Strongly Aligned": "Aligned"}
    return collapsed.get(x, x)
def stem_sentences(x):
    """Porter-stem every space-separated token of *x* that WordNet recognizes.

    Tokens with no WordNet synsets are dropped entirely; survivors are
    stemmed and re-joined with single spaces.
    """
    stemmed = [porter.stem(token)
               for token in x.split(" ")
               if wordnet.synsets(token)]
    return " ".join(stemmed)
# creating bag of words representations from description
# Create a Bag of Words Model with Sklearn
# import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_BoW(df_wiki_node, column_name, param1=5, param2=.99):
    """Build a unigram+bigram bag-of-words DataFrame from one text column.

    Parameters
    ----------
    df_wiki_node : DataFrame holding the corpus.
    column_name : name of the text column to vectorize.
    param1, param2 : forwarded to CountVectorizer as min_df / max_df.

    Returns a DataFrame with one row per document and one column per n-gram.
    """
    documents = df_wiki_node[column_name].values.tolist()
    vectorizer = CountVectorizer(min_df=param1,
                                 max_df=param2,
                                 ngram_range=(1, 2),
                                 stop_words='english')
    counts = vectorizer.fit_transform(documents)
    return pd.DataFrame(counts.toarray(),
                        columns=vectorizer.get_feature_names_out())
import wordninja
def split_words(x):
    """Segment run-together words in *x* (e.g. 'cleanenergy' -> 'clean energy')."""
    pieces = wordninja.split(x)
    return " ".join(pieces)
def lower_string(x):
    """Replace every '&' with a space, then lowercase the whole string."""
    return x.replace("&", " ").lower()
from pattern.text.en import singularize
def clean_text(text):
    """Extract lowercased nouns from *text*, singularizing the longer ones.

    Tokenizes and POS-tags the text with NLTK, keeps only noun tokens
    (Penn Treebank tags starting with 'N'), singularizes nouns longer than
    three characters (short tokens such as acronyms pass through verbatim),
    and returns them space-joined in lower case.  Returns the sentinel
    string "nothing" if tagging/singularization fails.
    """
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    try:
        nouns = []
        for word, pos in tags:
            if pos[0] == "N":
                # Only singularize words long enough to be real plurals;
                # <=3-letter tokens ("gas", acronyms) are kept as-is.
                nouns.append(singularize(word) if len(word) > 3 else word)
        return " ".join(nouns).lower()
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; narrowed so only real errors fall back to "nothing".
        return "nothing"
# df_merge = pd.read_csv("./results_news/features4.csv")
# The news-feature matrix uses the same 11 summary statistics for each of the
# 5 base signals, so the 55 column names are generated rather than listed.
_signals = ['magnitude', 'score', 'numMentions', 'avgSalience', 'overall_score']
_stats = ['sum', 'mean', 'std', 'median', 'var', 'amin', 'amax',
          'percentile_5', 'percentile_95', 'percentile_10', 'percentile_90']
column_features = [f"{signal}_{stat}" for signal in _signals for stat in _stats]
# number = 1
all_scores_all_SDGs = []
# [notebook output] [nltk_data] Downloading package omw-1.4 to [nltk_data] C:\Users\qhuca\AppData\Roaming\nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
# SDG index to model (1-17); hard-coded here instead of the argparse flag above.
number = 7
# df = pd.read_csv("./data/msci3.csv")
# df1 = df[["Company Name", "Company ID", '01. No Poverty', '02. Zero Hunger', '03. Good Health & Wellbeing', '04. Quality Education', '05. Gender Equality Description', '06. Clean Water and Sanitation', '07. Affordable Clean Energy', '08. Decent Work and Economic Growth', '09. Industry Innovation and Infrastructure', '10. Reduced Inequalities', '11. Sustainable Cities and Communities', '12. Responsible Consumption and Production', '13. Climate Action', '14. Life Below Water', '15. Life on Land', '16. Peace, Justice & Strong Institutions', '17. Partnership for the Goals']].fillna('0')
# df1 = df1.rename(columns = {"Company Name": "company"})
# df1 = df1.replace('\xa0', '0')
# df1 = df1.replace(' ', '0')
# all_sdgs = ['01. No Poverty', '02. Zero Hunger', '03. Good Health & Wellbeing', '04. Quality Education', '05. Gender Equality Description', '06. Clean Water and Sanitation', '07. Affordable Clean Energy', '08. Decent Work and Economic Growth', '09. Industry Innovation and Infrastructure', '10. Reduced Inequalities', '11. Sustainable Cities and Communities', '12. Responsible Consumption and Production', '13. Climate Action', '14. Life Below Water', '15. Life on Land', '16. Peace, Justice & Strong Institutions', '17. Partnership for the Goals']
# sdg = all_sdgs[number-1]
# df_label = df1[["company",sdg]]
# variable6 = "GICS Sector"
# variable5 = sdg
# --- Build the labelled dataset for the selected SDG --------------------------
print("SDG ", number, " is calculating ...... ")
msci = pd.read_csv("./data/msci.csv")
# msci2 names SDG 3's operations column inconsistently; normalize it on load.
msci2 = pd.read_csv("./data/msci2.csv").rename(columns={"SDG_03_OPS_ALIGNMENT":"SDG_03_OPER_ALIGNMENT"})
variable6 = "GICS Industry"
# Target label column; single-digit SDG numbers are zero-padded in msci2.
if number >= 10:
    variable5 = "SDG_{}_PROD_ALIGNMENT".format(number)
else:
    variable5 = "SDG_0{}_PROD_ALIGNMENT".format(number) # another thing
# Join the two MSCI extracts via the company identifier ("Company ID" == "Figi")
# to attach alignment labels to company names.
SDG1 = msci[["Company Name", "Company ID"]].dropna()
SDG2 = msci2[["ISSUER_NAME", "Figi", variable5]].dropna()
df_label = SDG1.merge(SDG2, left_on="Company ID", right_on="Figi")[["Company Name", variable5]]
df_label = df_label.rename(columns = {"Company Name": "company"})
# GICS industry per company, joined on company name (inner merge below).
df_sector = pd.read_csv("./data/Fundamental.csv")[["Company Name",variable6]].rename(columns={"Company Name": "company"})
# df_merge2 = df_merge.merge(df_sector,on="company", how="right")
df_merge3 = df_sector.merge(df_label)
# added
# Attach Wikipedia product descriptions; rows lacking any field are dropped.
df_wiki = pd.read_csv("./temp_data/wiki/wiki_product_info.csv",sep="\t")
df_merge3 = df_merge3.merge(df_wiki[["company","product_info"]],on="company").dropna()
# Ordinal encoding of the five MSCI alignment categories (0 = worst).
encoded_dict = {"Strongly Misaligned":0,'Misaligned':1,"Neutral":2,"Aligned":3,"Strongly Aligned":4}
df_merge3[variable5] = df_merge3[variable5].map(encoded_dict)
# Model text = GICS industry name + product description, then cleaned through
# the helpers above: word segmentation -> lowercasing -> noun extraction.
df_merge3["merged_text"] = df_merge3[variable6] + " " + df_merge3["product_info"]
# features = df_merge3["product_info"]
df_merge3["merged_text"] = df_merge3["merged_text"].progress_apply(split_words)
df_merge3["merged_text"] = df_merge3["merged_text"].progress_apply(lower_string)
df_merge3["merged_text"] = df_merge3["merged_text"].progress_apply(clean_text)
# Keep only documents with a non-trivial amount of cleaned text.
df_merge3 = df_merge3[df_merge3.merged_text.str.len()>100]
# Drop classes that occur exactly once (stratified splitting needs >= 2 rows).
# NOTE(review): `tmp["index"]` relies on older pandas reset_index() naming the
# former-index column "index"; newer pandas names it after the Series — verify.
tmp = df_merge3[variable5].value_counts().reset_index()
index = tmp[tmp[variable5]==1]["index"].values
df_merge3 = df_merge3[~df_merge3[variable5].isin(index)]
features = df_merge3["merged_text"]
labels = df_merge3[variable5]
# [notebook output] SDG 7 is calculating ......
# [notebook output] 100%|█████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:04<00:00, 221.33it/s] 100%|██████████████████████████████████████████████████████████████████████████| 1063/1063 [00:00<00:00, 529644.23it/s] 100%|██████████████████████████████████████████████████████████████████████████████| 1063/1063 [00:11<00:00, 89.47it/s]
# --- Encode targets onto a contiguous 0..K-1 range ----------------------------
y = labels
# X = pd.concat([features1.reset_index(drop=True),features4.reset_index(drop=True)], axis=1)
X = features
from sklearn import preprocessing
# Encode for string labels
# Re-encode the (already integer) ordinal labels contiguously, since some
# classes may have been dropped by the singleton filter above.
label_encoder = preprocessing.LabelEncoder().fit(y)
y = label_encoder.transform(y)
all_unique_labels = pd.Series(y).unique()
all_unique_labels.sort()  # in-place ascending sort; reused later as class_names
pd.Series(y).value_counts()  # notebook-style inspection; value is discarded
# [notebook output] 2 886 0 68 3 38 1 31 4 19 dtype: int64
labels.value_counts()  # notebook-style inspection of the raw label counts; value is discarded
# [notebook output] 2 886 0 68 3 38 1 31 4 19 Name: SDG_07_PROD_ALIGNMENT, dtype: int64
from sklearn.model_selection import train_test_split
import xgboost
import sklearn
# import lightgbm as lgb
# Stratified 80/20 split so the rare alignment classes appear in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(y_train.shape, y_test.shape)
# [notebook output] (833,) (209,)
pd.Series(y_test).value_counts()  # notebook-style inspection of test-set class counts; value is discarded
# [notebook output] 2 178 0 14 3 7 1 6 4 4 dtype: int64
# TF-IDF over unigrams+bigrams; drop very common (>70% of docs) and very
# rare (<1% of docs) terms.  Fit on train only, then transform test.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(stop_words = "english", ngram_range=(1,2), max_df=0.7, min_df=0.01)
# , max_df=0.7, min_df=0.05
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
from sklearn.base import clone, BaseEstimator, ClassifierMixin
class OrdinalClassifier():
    """Ordinal classification via K-1 cumulative binary classifiers.

    Implements the Frank & Hall (2001) reduction: for ordered classes
    V1 < ... < VK, one binary classifier per threshold learns Pr(y > Vi);
    per-class probabilities are recovered from differences of these
    cumulative probabilities.

    Parameters
    ----------
    clf : estimator
        A scikit-learn style binary classifier with fit/predict_proba;
        it is cloned once per threshold.
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}  # threshold index i -> classifier for Pr(y > unique_class[i])

    def fit(self, X, y):
        """Fit one binary classifier per ordinal threshold.

        Returns self (scikit-learn convention) so fit().predict() chains work.
        """
        self.unique_class = np.sort(np.unique(y))
        # BUG FIX: was `> 2`, which silently skipped fitting for binary
        # problems and left predict_proba broken; one threshold classifier
        # is needed whenever there are at least two classes.
        if self.unique_class.shape[0] > 1:
            for i in range(self.unique_class.shape[0] - 1):
                # Binary target: does the label exceed threshold class i?
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of class probabilities."""
        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i, y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[i][:, 1])
            elif i in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(clfs_predict[i - 1][:, 1] - clfs_predict[i][:, 1])
            else:
                # VK = Pr(y > VK-1)
                predicted.append(clfs_predict[i - 1][:, 1])
        # NOTE(review): differences can be slightly negative when the binary
        # classifiers are mutually inconsistent; rows are not re-normalized.
        return np.vstack(predicted).T

    def predict(self, X):
        """Predict the index (0..K-1) of the most probable class per sample."""
        return np.argmax(self.predict_proba(X), axis=1)
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
# Random oversampling balances the heavily skewed alignment classes
# (the "Neutral" class dominates) before fitting the Naive Bayes baseline.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train_vectors, y_train)
nb = MultinomialNB(alpha=.01)
nb.fit(X_resampled, y_resampled)
# Ordinal wrapper kept for reference; the plain NB baseline is used below.
# nb = OrdinalClassifier(nb)
# nb.fit(X_resampled, y_resampled)
# [notebook output] MultinomialNB(alpha=0.01)
y_pred = nb.predict(test_vectors)
sklearn.metrics.f1_score(y_test, y_pred, average='weighted')  # displayed, not stored
# [notebook output] 0.8609466749878649
# Explaining predictions using lime
from lime import lime_text
from sklearn.pipeline import make_pipeline
# Chain raw text -> TF-IDF -> NB so LIME can perturb raw strings directly.
c = make_pipeline(vectorizer, nb)
print(c.predict_proba([X_test.values[8]]).round(3))
# [notebook output] [[0.022 0.    0.971 0.007 0.   ]]
class_names = all_unique_labels  # encoded class ids, ascending
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)
# For every class, collect up to three test-set indices the model classified
# correctly; these documents are the ones LIME explains below.
examples = []
for element in class_names:
    A = np.where(np.equal(y_pred, y_test))[0]   # indices predicted correctly
    B = np.where(np.equal(y_test, element))[0]  # indices whose true class == element
    mask = np.in1d(A, B)                        # correct AND belonging to this class
    try:
        examples.append(A[np.where(mask)[0]][0:3])
    except IndexError:
        # BUG FIX: was a bare `except:` (swallowed even KeyboardInterrupt);
        # numpy slicing cannot actually raise here, so this is purely defensive.
        continue
from itertools import chain
examples = list(chain.from_iterable(examples))
pd.Series(y).value_counts()  # notebook-style inspection; value is discarded
# [notebook output] 2 886 0 68 3 38 1 31 4 19 dtype: int64
labels.value_counts()  # notebook-style inspection; value is discarded
# [notebook output] 2 886 0 68 3 38 1 31 4 19 Name: SDG_07_PROD_ALIGNMENT, dtype: int64
# Render a LIME explanation for each collected example document.
for element in examples:
    idx=element
    exp = explainer.explain_instance(X_test.values[idx], c.predict_proba, num_features=6)
    print('Document id: %d' % idx)
    print('Predicted class =', class_names[nb.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
    print('True class: %s' % class_names[y_test[idx]])
    # Re-explain with top_labels=2 to show the two most probable classes.
    exp = explainer.explain_instance(X_test.values[idx], c.predict_proba, num_features=6, top_labels=2)
    print('Most possible two classes: %s' % exp.available_labels())
    exp.show_in_notebook(text=True)
# [notebook output] Document id: 19 Predicted class = 0 True class: 0 Most possible two classes: [0, 2]
# [notebook output] Document id: 33 Predicted class = 0 True class: 0 Most possible two classes: [0, 1]
# [notebook output] Document id: 56 Predicted class = 0 True class: 0 Most possible two classes: [0, 1]
# [notebook output] Document id: 1 Predicted class = 2 True class: 2 Most possible two classes: [2, 0]
# [notebook output] Document id: 2 Predicted class = 2 True class: 2 Most possible two classes: [2, 0]
# [notebook output] Document id: 3 Predicted class = 2 True class: 2 Most possible two classes: [2, 0]
# print ('Explanation for class %s' % class_names[0])
# print ('\n'.join(map(str, exp.as_list(label=0))))
# print ()
# print ('Explanation for class %s' % class_names[2])
# print ('\n'.join(map(str, exp.as_list(label=2))))
# print ()
# print ('Explanation for class %s' % class_names[4])
# print ('\n'.join(map(str, exp.as_list(label=4))))
# exp.show_in_notebook(text=True)
# exp.show_in_notebook(text=X_test.values[idx], labels=(0,))
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from lime import submodular_pick
# Submodular pick: choose a small, diverse set of LIME explanations that best
# covers the model's global behavior over a 500-document training sample.
sp_obj = submodular_pick.SubmodularPick(explainer, X_train.values, c.predict_proba, sample_size=500, num_features=10, num_exps_desired=5)
#Plot the 5 explanations
# [exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_obj.sp_explanations];
# Make it into a dataframe
# Rows = explanations, columns = features, values = LIME weights for each
# explanation's top predicted label only (missing features filled with 0).
W_pick=pd.DataFrame([dict(this.as_list(this.available_labels()[0])) for this in sp_obj.sp_explanations]).fillna(0)
W_pick['prediction'] = [this.available_labels()[0] for this in sp_obj.sp_explanations]
#Making a dataframe of all the explanations of sampled points
W=pd.DataFrame([dict(this.as_list(this.available_labels()[0])) for this in sp_obj.explanations]).fillna(0)
W['prediction'] = [this.available_labels()[0] for this in sp_obj.explanations]
#Plotting the aggregate importances (top 25 mean |weight|, horizontal bars)
np.abs(W.drop("prediction", axis=1)).mean(axis=0).sort_values(ascending=False).head(
    25
).sort_values(ascending=True).iplot(kind="barh")
#Aggregate importances split by classes
grped_coeff = W.groupby("prediction").mean()
grped_coeff = grped_coeff.T
# Rank features by the absolute mean weight of the first class column.
grped_coeff["abs"] = np.abs(grped_coeff.iloc[:, 0])
grped_coeff.sort_values("abs", inplace=True, ascending=False)
grped_coeff.head(25).sort_values("abs", ascending=True).drop("abs", axis=1).iplot(
    kind="barh", bargap=0.5
)
pd.Series(y).value_counts()  # notebook-style inspection; value is discarded
# [notebook output] 2 886 0 68 3 38 1 31 4 19 dtype: int64
labels.value_counts()  # notebook-style inspection; value is discarded
# [notebook output] 2 886 0 68 3 38 1 31 4 19 Name: SDG_07_PROD_ALIGNMENT, dtype: int64
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)  # rows = true class, cols = predicted class; displayed, not stored
# [notebook output] array([[  9,   0,   4,   1,   0],
# [notebook output]        [  0,   0,   6,   0,   0],
# [notebook output]        [  0,   0, 178,   0,   0],
# [notebook output]        [  0,   0,   7,   0,   0],
# [notebook output]        [  0,   2,   2,   0,   0]], dtype=int64)